Last updated: 2017-10-11 21:59:31
This is an analysis of potential data problems in the Pittsburgh Police Incident Blotter Archive.
These are the R packages used in this analysis:
library(rmarkdown)
library(knitr)
library(tidyverse)
library(lubridate)
library(viridis)
library(ggmap)
library(scales)
Refer to the Exploratory Analysis post for an intro on working with this data
Does the Zone the incident is assigned to match the geolocations the incident is reported at?
city_map_11 <- get_map(location = "Oakland, Pittsburgh, PA",
zoom = 11,
maptype = "toner",
source = "stamen",
messaging = FALSE)
city_map_11 <- ggmap(city_map_11)
df_map_zones <- df %>%
filter(zone %in% c(1:6)) %>%
select(zone, x, y) %>%
mutate(zone = as.factor(paste("Zone:", zone)))
In this view, the data looks accurate
city_map_11 +
geom_point(data = df_map_zones, aes(x, y, color = as.factor(zone)), alpha = .3, size = .7, show.legend = FALSE) +
scale_color_viridis(discrete = TRUE) +
labs(title = "Pittsburgh Police Incident Data",
x = NULL,
y = NULL) +
theme(legend.position = "bottom",
legend.direction = "horizontal",
axis.text = element_blank())
## Warning: Removed 22714 rows containing missing values (geom_point).
One note of concern is that 5% of the data is outside the x,y coordinates in this map
paste0(round(22716 / nrow(df_map_zones), 2) * 100, "%")
## [1] "5%"
Faceting the data by Zone to separate it, the data looks less accurate
city_map_12 <- get_map(location = "Oakland, Pittsburgh, PA",
zoom = 12,
maptype = "toner",
source = "stamen",
messaging = FALSE)
city_map_12 <- ggmap(city_map_12)
city_map_12 +
geom_point(data = df_map_zones, aes(x, y, color = zone), alpha = .3, size = 1) +
facet_wrap(~zone, nrow = 2) +
scale_color_viridis(discrete = TRUE) +
labs(title = "Pittsburgh Crime Incident Data",
x = NULL,
y = NULL) +
guides(alpha = FALSE,
color = FALSE) +
theme(legend.position = "bottom",
legend.direction = "horizontal",
axis.text = element_blank())
## Warning: Removed 25078 rows containing missing values (geom_point).
The reporting problems appear most prominently in 2005 and 2006
city_map_facets <- ggmap(get_map("North Oakland, Pittsburgh, PA",
zoom = 12,
maptype = "toner-lite",
source = "stamen"))
df_map_zones_year <- df %>%
select(year, zone, x, y) %>%
filter(zone %in% c(1:6)) %>%
mutate(zone = as.factor(paste("Zone", zone)))
city_map_facets +
geom_point(data = df_map_zones_year, aes(x, y), alpha = .3, size = 1) +
facet_grid(zone~year) +
scale_color_viridis(discrete = TRUE) +
labs(title = "Pittsburgh Crime Incident Data",
x = NULL,
y = NULL) +
guides(alpha = FALSE,
color = FALSE) +
theme(legend.position = "bottom",
legend.direction = "horizontal",
axis.text = element_blank(),
strip.text = element_text(size = 9),
strip.text.y = element_text(angle=0))
## Warning: Removed 27544 rows containing missing values (geom_point).
Does the data for the Neighborhoods look the same?
Let’s just look at the top 10 Neighorhoods in terms of # of incidents
neighborhoods_top10 <- df %>%
filter(!is.na(neighborhood)) %>%
select(neighborhood) %>%
group_by(neighborhood) %>%
count() %>%
arrange(-n) %>%
ungroup() %>%
mutate(neighborhood = factor(neighborhood)) %>%
top_n(n = 10, wt = n)
df_nbh <- df %>%
filter(neighborhood %in% neighborhoods_top10$neighborhood) %>%
select(neighborhood, x, y) %>%
mutate(neighborhood = factor(neighborhood, levels = neighborhoods_top10$neighborhood))
Looking at the data from the top 10 Neighborhoods in one map:
city_map_12 +
geom_point(data = df_nbh, aes(x, y, color = neighborhood), alpha = .3, size = 1) +
scale_color_viridis(discrete = TRUE) +
labs(title = "Pittsburgh Police Incident Data",
x = NULL,
y = NULL) +
guides(alpha = FALSE,
color = FALSE) +
theme(axis.text = element_blank())
## Warning: Removed 4820 rows containing missing values (geom_point).
The borders between Neighborhoods are significantly less clearly delineated than the borders for the Zones were
How does the data look when it is faceted by Neighborhood?
city_map_12 +
geom_point(data = df_nbh, aes(x, y, color = neighborhood), alpha = .3, size = 1) +
facet_wrap(~neighborhood, ncol = 5) +
scale_color_viridis(discrete = TRUE) +
labs(title = "Pittsburgh Police Incident Data",
x = NULL,
y = NULL) +
guides(color = FALSE) +
theme(axis.text = element_blank(),
strip.text = element_text(size = 6))
## Warning: Removed 4820 rows containing missing values (geom_point).
There appears to be significant data quality issues with the Neighborhood designations
Many Neighborhoods have incidents reported in multiple Zones
df_zone_nbh <- df %>%
filter(zone %in% c(1:6)) %>%
group_by(zone, neighborhood) %>%
count() %>%
arrange(-n) %>%
ungroup() %>%
mutate(neighborhood = factor(neighborhood))
ggplot(df_zone_nbh, aes(zone, reorder(neighborhood, n), fill = n)) +
geom_tile(color = "grey") +
facet_wrap(~zone,
nrow = 1,
scales = "free_x") +
coord_equal() +
labs(x = "Zone",
y = "Neighborhood",
title = "Pittsburgh Police Incident Data") +
guides(fill = guide_colorbar("Count of Incidents")) +
scale_fill_viridis() +
scale_x_discrete(expand = c(0,0)) +
scale_y_discrete(expand = c(0,0)) +
theme(axis.text = element_text(size = 9),
strip.text = element_text(size = 9),
panel.grid = element_blank())
What are the drivers of the incorrect assignments?
First, we need to identify the correct Zone for each Neighborhood
My method is to find the Zone with the highest # of incidents for a Neighborhood
df_correct_zone <- df %>%
mutate(key = paste(zone, neighborhood)) %>%
select(key, zone, neighborhood) %>%
group_by(key, zone, neighborhood) %>%
count() %>%
arrange(neighborhood, -n) %>%
group_by(neighborhood) %>%
slice(1) %>%
ungroup() %>%
arrange(zone) %>%
mutate(correct_zone = zone) %>%
select(correct_zone, neighborhood)
Testing this method for Zone 1:
df_correct_zone %>%
filter(correct_zone == "1")
## # A tibble: 18 x 2
## correct_zone neighborhood
## <chr> <chr>
## 1 1 Allegheny Center
## 2 1 Allegheny West
## 3 1 Brighton Heights
## 4 1 California-Kirkbride
## 5 1 Central Northside
## 6 1 Chateau
## 7 1 East Allegheny
## 8 1 Fineview
## 9 1 Manchester
## 10 1 Marshall-Shadeland
## 11 1 North Shore
## 12 1 Northview Heights
## 13 1 Perry North
## 14 1 Perry South
## 15 1 Spring Garden
## 16 1 Spring Hill-City View
## 17 1 Summer Hill
## 18 1 Troy Hill-Herrs Island
Testing this method for Zone 2:
df_correct_zone %>%
filter(correct_zone == "2")
## # A tibble: 12 x 2
## correct_zone neighborhood
## <chr> <chr>
## 1 2 Bedford Dwellings
## 2 2 Bluff
## 3 2 Central Lawrenceville
## 4 2 Crawford-Roberts
## 5 2 Golden Triangle/Civic Arena
## 6 2 Lower Lawrenceville
## 7 2 Middle Hill
## 8 2 Polish Hill
## 9 2 Strip District
## 10 2 Terrace Village
## 11 2 Upper Hill
## 12 2 Upper Lawrenceville
This approximation appears to be correct
Then, calculate how many of a Neighborhood’s incidents were reported in the correct zone
df_zones_nbh <- df %>%
mutate(key = paste(zone, neighborhood)) %>%
select(key, zone, neighborhood) %>%
filter(!is.na(zone),
!is.na(neighborhood),
!(zone %in% c("OSC", "OUTSIDE")),
!(neighborhood %in% c("Outside State", "Outside County", "Outside City"))) %>%
group_by(key, zone, neighborhood) %>%
count() %>%
left_join(., df_correct_zone) %>%
mutate(flag = ifelse(zone == correct_zone, "Correct", "Incorrect")) %>%
group_by(zone, neighborhood, flag) %>%
summarize(n = sum(n)) %>%
spread(key = flag,
value = n,
fill = 0) %>%
group_by(neighborhood) %>%
summarize(Correct = sum(Correct),
Incorrect = sum(Incorrect),
count = Correct + Incorrect,
percent_correct = round(Correct/count, 2)) %>%
left_join(., df_correct_zone)
## Joining, by = "neighborhood"
## Joining, by = "neighborhood"
ggplot(df_zones_nbh, aes(count, percent_correct, label = neighborhood, fill = correct_zone)) +
geom_label(size = 2) +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = comma) +
labs(x = "Count of Incidents",
y = "Percent Reported in Correct Zone",
title = "Nieghborhood-Zone Reporting Analysis") +
guides(fill = guide_legend(title = "Correct Zone")) +
theme(axis.title = element_text(size = 10))
Zone 6 appears to have the lowest % of correct Zone assignments. Golden Triangle/Civic Arena and Bloomfield appear to have most of the incorrect assignments
Our original question was: What are the drivers of the incorrect assignments?
df_bad_zones_helper1 <- df %>%
mutate(key = paste(zone, neighborhood)) %>%
select(key, zone, neighborhood) %>%
filter(!is.na(zone),
!is.na(neighborhood),
(zone %in% 1:6),
!(neighborhood %in% c("Outside State", "Outside County", "Outside City"))) %>%
group_by(key, zone, neighborhood) %>%
count() %>%
left_join(., df_correct_zone) %>%
mutate(flag = ifelse(zone == correct_zone, "Correct", "Incorrect")) %>%
group_by(zone, neighborhood, flag) %>%
summarize(n = sum(n)) %>%
filter(flag == "Incorrect") %>%
group_by(neighborhood) %>%
summarize(n = sum(n)) %>%
arrange(n)
## Joining, by = "neighborhood"
df_bad_zones_helper2 <- df %>%
mutate(key = paste(zone, neighborhood)) %>%
select(key, zone, neighborhood) %>%
filter(!is.na(zone),
!is.na(neighborhood),
(zone %in% 1:6),
!(neighborhood %in% c("Outside State", "Outside County", "Outside City"))) %>%
group_by(key, zone, neighborhood) %>%
count() %>%
left_join(., df_correct_zone) %>%
mutate(flag = ifelse(zone == correct_zone, "Correct", "Incorrect")) %>%
group_by(zone, neighborhood, flag) %>%
summarize(n = sum(n)) %>%
filter(flag == "Incorrect") %>%
group_by(zone) %>%
summarize(n = sum(n)) %>%
arrange(-n)
## Joining, by = "neighborhood"
df_bad_zones <- df %>%
mutate(key = paste(zone, neighborhood)) %>%
select(key, zone, neighborhood) %>%
filter(!is.na(zone),
!is.na(neighborhood),
(zone %in% 1:6),
!(neighborhood %in% c("Outside State", "Outside County", "Outside City"))) %>%
group_by(key, zone, neighborhood) %>%
count() %>%
left_join(., df_correct_zone) %>%
mutate(flag = ifelse(zone == correct_zone, "Correct", "Incorrect")) %>%
group_by(zone, neighborhood, flag) %>%
summarize(n = sum(n)) %>%
filter(flag == "Incorrect") %>%
ungroup() %>%
mutate(zone = factor(zone, levels = df_bad_zones_helper2$zone),
neighborhood = factor(neighborhood, levels = df_bad_zones_helper1$neighborhood))
## Joining, by = "neighborhood"
ggplot(df_bad_zones, aes(zone, neighborhood, fill = n)) +
geom_tile() +
coord_equal() +
scale_fill_viridis() +
theme(panel.grid = element_blank()) +
scale_y_discrete(expand = c(0,0)) +
scale_x_discrete(expand = c(0,0)) +
labs(x = "Zone",
y = "Neighborhood",
title = "Incorrect Neighborhood-Zone Assignments") +
guides(fill = guide_colorbar("Count of Incidents")) +
theme(axis.text = element_text(size = 8))
Brookline has the most incidents reported in the incorrect zone
Zone 3 is a major driver of incorrect Zone assignments
Again, the reporting problems appear most prominently in 2005 and 2006
city_map_facets <- ggmap(get_map("North Oakland, Pittsburgh, PA",
zoom = 12,
maptype = "toner-lite",
source = "stamen"))
df_nbh_year <- df %>%
filter(neighborhood %in% c("Golden Triangle/Civic Arena",
"Bloomfield",
"North Oakland",
"South Side Flats",
"Brookline",
"Homewood South",
"Central Oakland",
"Lincoln-Lemington Lamar",
"Carrick",
"Shadyside")) %>%
select(year, neighborhood, x, y) %>%
mutate(neighborhood = factor(neighborhood, levels = c("Golden Triangle/Civic Arena",
"Bloomfield",
"North Oakland",
"South Side Flats",
"Brookline",
"Homewood South",
"Central Oakland",
"Lincoln-Lemington Lamar",
"Carrick",
"Shadyside")))
city_map_facets +
geom_point(data = df_nbh_year, aes(x, y), alpha = .3, size = 1) +
facet_grid(neighborhood~year) +
scale_color_viridis(discrete = TRUE) +
labs(title = "Pittsburgh Crime Incident Data",
x = NULL,
y = NULL) +
guides(alpha = FALSE,
color = FALSE) +
theme(axis.text = element_blank(),
strip.text = element_text(size = 9),
strip.text.y = element_text(angle=0))
## Warning: Removed 4705 rows containing missing values (geom_point).
Zone 6 was closed in 2003, and reopened in 2008.
Therefore, we should expect that there are 0 incidents between 2005 (when the data begins) and 2008 in Zone 6
df %>%
filter(zone == 6, year <= 2008) %>%
select(zone, date, description) %>%
group_by(zone) %>%
count()
## # A tibble: 1 x 2
## # Groups: zone [1]
## zone n
## <chr> <int>
## 1 6 2889
This shows that there are 2,889 incidents in this period for Zone 6
df %>%
filter(zone == 6, year <= 2008) %>%
select(zone, date, description) %>%
head()
## # A tibble: 6 x 3
## zone date description
## <chr> <date> <chr>
## 1 6 2005-07-09 <NA>
## 2 6 2005-10-05 <NA>
## 3 6 2005-10-05 IDENTITY THEFT
## 4 6 2005-10-28 <NA>
## 5 6 2005-04-26 INVOLUNTARY DEV SEXUAL INTER
## 6 6 2005-03-24 <NA>
Where did these incidents occur?
z6_map <- get_map(location = "Mount Washington, Pittsburgh, PA",
zoom = 12,
maptype = "toner",
source = "stamen",
messaging = FALSE)
df_map_z6 <- df %>%
filter(zone == 6, year <= 2008)
ggmap(z6_map) +
geom_point(data = df_map_z6, aes(x, y, color = zone), alpha = .3, size = 1) +
scale_color_viridis(discrete = TRUE) +
guides(color = FALSE) +
labs(x = NULL,
y = NULL,
title = "Zone 6 Incidents 2005-2008") +
theme(axis.text = element_blank())
## Warning: Removed 170 rows containing missing values (geom_point).
This broadly lines up with the borders of Zone 6
Incidents reported in Zone 6 before Zone 6 was reopened in 2008 appear to be geolocated appropriately
I think there should be a special flag for these incidents. Assigning incidents to a Zone that did not exist at the time seems confusing